{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "private_outputs": true,
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/mikexcohen/Statistics_book/blob/main/stats_ch17_power.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Modern statistics: Intuition, Math, Python, R\n",
    "## Mike X Cohen (sincxpress.com)\n",
    "### https://www.amazon.com/dp/B0CQRGWGLY\n",
    "#### Code for chapter 17"
   ],
   "metadata": {
    "id": "mzfXc9E3Xq7k"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "import numpy as np\n",
    "import scipy.stats as stats\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# statsmodels library for computing power\n",
    "import statsmodels.stats.power as smp\n",
    "\n",
    "\n",
    "# define global figure properties used for publication\n",
    "import matplotlib_inline.backend_inline\n",
    "matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # display figures in vector format\n",
    "plt.rcParams.update({'font.size':14,             # font size\n",
    "                     'savefig.dpi':300,          # output resolution\n",
    "                     'axes.titlelocation':'left',# title location\n",
    "                     'axes.spines.right':False,  # remove axis bounding box\n",
    "                     'axes.spines.top':False,    # remove axis bounding box\n",
    "                     })"
   ],
   "metadata": {
    "id": "3i9Jx5U-rsrx"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "rNPyhCFurso6"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Power for a one-sample t-test"
   ],
   "metadata": {
    "id": "aNU7rUQ-D65M"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters\n",
    "xBar = 1\n",
    "h0   = 0\n",
    "std  = 2\n",
    "n    = 42 # sample size\n",
    "alpha = .05  # significance level\n",
    "\n",
    "# Compute the non-centrality parameter\n",
    "tee = (xBar-h0) / (std/np.sqrt(n))\n",
    "\n",
    "\n",
    "# Critical t-values (2-tailed)\n",
    "df = n - 1  # df for one-sample t-test\n",
    "t_critL = stats.t.ppf(alpha/2, df) # in Equation 17.2, this is -\\tau_{\\alpha/2,df}\n",
    "t_critR = stats.t.ppf(1-alpha/2, df)\n",
    "\n",
    "# two one-sided power areas\n",
    "powerL = stats.t.cdf(t_critL+tee, df) # note shiting the distribution\n",
    "powerR = 1 - stats.t.cdf(t_critR+tee, df)\n",
    "\n",
    "# note: can also use the loc input:\n",
    "#powerL = stats.t.cdf(t_critL, df, loc=delta)\n",
    "\n",
    "# total power\n",
    "totalPower = powerL + powerR\n",
    "\n",
    "# and report\n",
    "print(f't = {tee:.3f}')\n",
    "print(f'shifted tau-left = {t_critL+tee:.3f}')\n",
    "print(f'shifted tau-right = {t_critR+tee:.3f}')\n",
    "print('')\n",
    "print(f'Statistical power: {totalPower:.4f}')"
   ],
   "metadata": {
    "id": "KI-OqlbD93Qt"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "L1M56LPa7R2S"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 17.2: Visualization of statistical power for this example"
   ],
   "metadata": {
    "id": "pNoW6XtQ--Sl"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "#### simulation parameters ####\n",
    "# this is the same code as above, but repeated here to make it easy for you to modify and explore\n",
    "\n",
    "# parameters\n",
    "xBar = 1\n",
    "h0   = 0\n",
    "std  = 2\n",
    "n    = 42 # sample size\n",
    "alpha = .05  # significance level\n",
    "\n",
    "# Compute the non-centrality parameter\n",
    "tee = (xBar-h0) / (std/np.sqrt(n))\n",
    "\n",
    "\n",
    "# Critical t-values (2-tailed)\n",
    "df = n - 1  # df for one-sample t-test\n",
    "t_critL = stats.t.ppf(alpha/2, df) # in Equation 17.2, this is -\\tau_{\\alpha/2,df}\n",
    "t_critR = stats.t.ppf(1-alpha/2, df)\n",
    "\n",
    "# two one-sided power areas\n",
    "powerL = stats.t.cdf(t_critL+tee, df) # note shiting the distribution\n",
    "powerR = 1 - stats.t.cdf(t_critR+tee, df)\n",
    "totalPower = powerL + powerR\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "########## now for the visualization ############\n",
    "\n",
    "# t-values\n",
    "tvals = np.linspace(-4,7,401)\n",
    "\n",
    "# open the figure\n",
    "plt.figure(figsize=(8,4))\n",
    "\n",
    "# draw the distributions\n",
    "plt.plot(tvals,stats.t.pdf(tvals,n-1)*np.diff(tvals[:2]),'k',linewidth=2, label='$H_0$ distribution')\n",
    "plt.plot(tvals,stats.t.pdf(tvals-tee,n-1)*np.diff(tvals[:2]),'--',linewidth=3,color=(.7,.7,.7), label=r'$H_A$ distribution')\n",
    "\n",
    "# critical t-values at alpha=.025 (\"tau\" in the equations)\n",
    "plt.axvline(t_critL,color='k',linestyle=':',zorder=-3)\n",
    "plt.axvline(t_critR,color='k',linestyle=':',zorder=-3)\n",
    "plt.text(t_critL-.2,stats.t.pdf(0,n-1)*.9*np.diff(tvals[:2]),r'-$\\tau$/2',rotation=90,ha='center',va='center')\n",
    "plt.text(t_critR+.22,stats.t.pdf(0,n-1)*.9*np.diff(tvals[:2]),r'$\\tau$/2',rotation=90,ha='center',va='center')\n",
    "\n",
    "# fill in areas for computing 1-beta (note: basically invisible on the left side; try setting xBar=-1)\n",
    "plt.fill_between(tvals,stats.t.pdf(tvals-tee,n-1)*np.diff(tvals[:2]),where=(tvals<t_critL),color=(.7,.7,.7),alpha=.5)\n",
    "plt.fill_between(tvals,stats.t.pdf(tvals-tee,n-1)*np.diff(tvals[:2]),where=(tvals>t_critR),color=(.7,.7,.7),alpha=.5,label=fr'1-$\\beta$ = {totalPower:.3f}')\n",
    "\n",
    "plt.xlabel('T-values')\n",
    "plt.ylabel('Probability')\n",
    "plt.legend(fontsize=12)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('power_powerExample.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "f3lWT44N7Rzm"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "ywvzbYae7Rw1"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Using statsmodels"
   ],
   "metadata": {
    "id": "qgXxKDR2DaQB"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters\n",
    "xBar = 1\n",
    "h0   = 0\n",
    "std  = 2 # sample standard deviation\n",
    "sampsize = 42\n",
    "alpha = .05  # significance level\n",
    "\n",
    "effectSize = (xBar-h0) / std\n",
    "power_sm = smp.TTestPower().power(\n",
    "    effect_size=effectSize, nobs=sampsize, alpha=alpha, alternative='two-sided')\n",
    "\n",
    "print(f'Statistical power using statsmodels: {power_sm:.4f}')"
   ],
   "metadata": {
    "id": "4k8-EK1W93Lg"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "Re51eYgXxZ4o"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Sample size for a desired power"
   ],
   "metadata": {
    "id": "9r1R97UGxcKE"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters\n",
    "power = .8  # desired statistical power level (1-\\beta)\n",
    "h0    = 0   # mean if H0 is true\n",
    "xBar  = 1   # sample mean\n",
    "std   = 1.5 # sample standard deviation\n",
    "\n",
    "# effect size\n",
    "effectSize = (xBar-h0) / std\n",
    "\n",
    "# compute sample size\n",
    "sample_size = smp.TTestPower().solve_power(\n",
    "    effect_size=effectSize, alpha=.05, power=power, alternative='two-sided')\n",
    "\n",
    "# and report\n",
    "print(f'Required sample size: {round(sample_size)}')"
   ],
   "metadata": {
    "id": "Wyz7TgG-xZ1Y"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "fzo1qqxUw6dl"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 1"
   ],
   "metadata": {
    "id": "hreQGeTdI2kN"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters\n",
    "std  = 2\n",
    "sampsize = 41\n",
    "xBars = np.linspace(-2,2,41)\n",
    "\n",
    "# initialize results vector\n",
    "powers = np.zeros(len(xBars))\n",
    "\n",
    "# run the experiment!\n",
    "for i,xm in enumerate(xBars):\n",
    "  powers[i] = smp.TTestPower().power(effect_size=xm/std, nobs=sampsize, alpha=.05)\n",
    "\n",
    "\n",
    "# and plot the results\n",
    "plt.figure(figsize=(7,3))\n",
    "plt.plot(xBars,powers,'ks',markersize=10,markerfacecolor=(.7,.7,.7))\n",
    "plt.xlabel(r'Sample mean (H$_0$=0)')\n",
    "plt.ylabel(r'Statistical power (1-$\\beta$)')\n",
    "plt.title(r'$\\bf{A}$)  Power by sample mean')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('power_ex1a.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "i0BYwJNOI5BG"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters\n",
    "xBar = .5\n",
    "sampleSizes = np.arange(10,251,step=10)\n",
    "\n",
    "# initialize results vector\n",
    "powers = np.zeros(len(sampleSizes))\n",
    "\n",
    "# the experiment\n",
    "for i,ss in enumerate(sampleSizes):\n",
    "  powers[i] = smp.TTestPower().power(effect_size=xBar/std, nobs=ss, alpha=.05)\n",
    "\n",
    "\n",
    "# plot the results\n",
    "plt.figure(figsize=(7,3))\n",
    "plt.plot(sampleSizes,powers,'ks',markersize=10,markerfacecolor=(.7,.7,.7))\n",
    "plt.xlabel('Sample size')\n",
    "plt.ylabel(r'Statistical power (1-$\\beta$)')\n",
    "plt.title(r'$\\bf{B}$)  Power by sample size')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('power_ex1b.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "fRybXAsbI2gz"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters\n",
    "xBars = np.linspace(-2,2,41)\n",
    "sampleSizes = np.arange(10,251,step=10)\n",
    "\n",
    "# initialize the results matrix\n",
    "powers = np.zeros((len(xBars),len(sampleSizes)))\n",
    "\n",
    "\n",
    "# run the experiment (manipulate mean and N independently)\n",
    "for xi,xm in enumerate(xBars):\n",
    "  for si,ss in enumerate(sampleSizes):\n",
    "    powers[xi,si] = smp.TTestPower().power(effect_size=xm/std, nobs=ss, alpha=.05)\n",
    "\n",
    "\n",
    "# and the results...\n",
    "plt.figure(figsize=(4,5))\n",
    "plt.imshow(powers,origin='lower',extent=[sampleSizes[0],sampleSizes[-1],xBars[0],xBars[-1]],\n",
    "           aspect='auto',cmap='gray',vmin=0,vmax=1)\n",
    "plt.colorbar()\n",
    "plt.xlabel('Sample size')\n",
    "plt.ylabel(r'Sample mean (H$_0$=0)')\n",
    "plt.yticks(range(-2,3))\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('power_ex1c.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "-WDs34T-r7YA"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "OHedLBv4rX9P"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 2"
   ],
   "metadata": {
    "id": "4CVS9NmjJr2j"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# (Note: some variables in this exercise were defined in Exercise 1)\n",
    "# parameters\n",
    "xBars = np.linspace(-2,2,42)\n",
    "power = np.linspace(.5,.95,27)\n",
    "\n",
    "# initialize the results matrix\n",
    "sampleSizes = np.zeros((len(xBars),len(power)))\n",
    "\n",
    "\n",
    "# run the experiment (manipulate mean and N independently)\n",
    "for xi,xm in enumerate(xBars):\n",
    "  for pi,pwr in enumerate(power):\n",
    "    sampleSizes[xi,pi] = smp.TTestPower().solve_power(effect_size=xm/std, alpha=.05, power=pwr)\n",
    "\n",
    "# and the results...\n",
    "plt.figure(figsize=(4,5))\n",
    "plt.imshow(sampleSizes,origin='lower',extent=[power[0],power[-1],xBars[0]/std,xBars[-1]/std],\n",
    "           aspect='auto',cmap='gray',vmin=10,vmax=100)\n",
    "plt.colorbar()\n",
    "plt.xlabel('Desired statistical power')\n",
    "plt.ylabel(r'Effect size ($\\pm\\overline{x}/s$)')\n",
    "plt.yticks(np.arange(-1,1.1,.5))\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('power_ex2a.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "hISY5uSMLBCW"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "plt.figure(figsize=(8,4))\n",
    "\n",
    "# (un)comment one of these lines\n",
    "rows2plot = slice(0,len(xBars),5)\n",
    "# rows2plot = slice(0,5)\n",
    "\n",
    "\n",
    "# for the plotting\n",
    "plt.plot(power,sampleSizes[rows2plot,:].T,'s-',markerfacecolor='w',markersize=8)\n",
    "plt.legend([f'{v/std:.2f}' for v in xBars[rows2plot]],bbox_to_anchor=(1,1),title='Effect size')\n",
    "plt.xlabel('Desired power')\n",
    "plt.ylabel('Required sample size')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('power_ex2b.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "KngqiuBTzVy2"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "dCDzRfSdm8-Y"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 3"
   ],
   "metadata": {
    "id": "N4hOaHlh8Lty"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters\n",
    "std  = 2\n",
    "sampsize = 42\n",
    "xBars = np.linspace(-2,2,41)\n",
    "\n",
    "# initialize results vector\n",
    "powers = np.zeros((len(xBars),3))\n",
    "\n",
    "# run the experiment!\n",
    "for i,xm in enumerate(xBars):\n",
    "  powers[i,0] = smp.TTestPower().power(effect_size=xm/std, nobs=sampsize, alpha=.001)\n",
    "  powers[i,1] = smp.TTestPower().power(effect_size=xm/std, nobs=sampsize, alpha=.01)\n",
    "  powers[i,2] = smp.TTestPower().power(effect_size=xm/std, nobs=sampsize, alpha=.1)\n",
    "\n",
    "# for an extra challenge, put the alpha values into a for-loop by raising 10 to higher negative powers!\n",
    "\n",
    "\n",
    "# and plot the results\n",
    "plt.figure(figsize=(9,3))\n",
    "plt.plot(xBars/std,powers[:,0],'ks',markersize=10,markerfacecolor=(.2,.2,.2),label=r'$\\alpha=.001$')\n",
    "plt.plot(xBars/std,powers[:,1],'ko',markersize=10,markerfacecolor=(.5,.5,.5),label=r'$\\alpha=.01$')\n",
    "plt.plot(xBars/std,powers[:,2],'k^',markersize=10,markerfacecolor=(.8,.8,.8),label=r'$\\alpha=.1$')\n",
    "plt.axhline(y=.8,linestyle='--',color='gray',zorder=-10,label=r'$1-\\beta=.8$')\n",
    "plt.xlabel(r'Effect size')\n",
    "plt.ylabel(r'Statistical power (1-$\\beta$)')\n",
    "plt.legend(bbox_to_anchor=(1,1))\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('power_ex3.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "oPU8zznt8LSi"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "0XuS3pph8LMv"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 4"
   ],
   "metadata": {
    "id": "GkJfnRn34zqg"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# basic usage (this is the code I showed in the book)\n",
    "smp.TTestPower().plot_power(dep_var='nobs',nobs=np.array([10,20,50]),effect_size=np.linspace(.5,1,5))\n",
    "plt.ylabel('Statistical power')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "eZQaEldQqD_f"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "### here's the solution to the exercise\n",
    "\n",
    "# setup a figure\n",
    "_,axs = plt.subplots(1,2,figsize=(12,4))\n",
    "\n",
    "# call the power plot calculations\n",
    "smp.TTestPower().plot_power(dep_var='nobs',nobs=np.arange(5,81),effect_size=np.linspace(.5,1,5),ax=axs[0])\n",
    "smp.TTestPower().plot_power(dep_var='effect_size',nobs=np.arange(10,151,20),effect_size=np.linspace(.1,.7,25),ax=axs[1])\n",
    "\n",
    "# some plot adjustments\n",
    "axs[0].set_title(r'$\\bf{A}$)  Power by sample size and effect sizes')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Power by effect size and sample sizes')\n",
    "axs[0].set_ylabel(r'Power (1-$\\beta$)')\n",
    "axs[1].set_ylabel(r'Power (1-$\\beta$)')\n",
    "\n",
    "# fix strange issue of sample sizes printing as 10.00\n",
    "axs[1].legend([l[:-3] for l in axs[1].get_legend_handles_labels()[1]])\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('power_ex4.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "o6alJWkLQInp"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "s-xnEUQ6VGzS"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 5"
   ],
   "metadata": {
    "id": "qszJ18ntVUdt"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# simulation parameters\n",
    "effectSize = .6  # group mean differences, divided by standard deviation.\n",
    "n1 = 50          # sample size of group 1.\n",
    "ssRatio = 2      # Ratio of sample size of group 2 to group 1. 1 means equal sample sizes.\n",
    "\n",
    "# Note about sample size parameters (text below is taken from https://www.statsmodels.org/dev/generated/statsmodels.stats.power.TTestIndPower.power.html#statsmodels.stats.power.TTestIndPower.power)\n",
    "# n1: number of observations of sample 1. The number of observations of sample two\n",
    "# is ratio times the size of sample 1, i.e. nobs2 = nobs1 * ratio\n",
    "\n",
    "# Compute power\n",
    "power = smp.TTestIndPower().power(effect_size=effectSize, nobs1=n1, alpha=.05, ratio=ssRatio)\n",
    "print(f'Total sample size is {n1}+{n1*ssRatio}={n1+n1*ssRatio}, power is {power:.2f}')"
   ],
   "metadata": {
    "id": "ntOXjK7v4x0F"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# the total sample size\n",
    "totalN = 100\n",
    "\n",
    "# sample sizes in group 1\n",
    "n1sampleSizes = np.arange(10,91,5)\n",
    "\n",
    "# initialize results vector\n",
    "powers = np.zeros(len(n1sampleSizes))\n",
    "\n",
    "\n",
    "# run the simulation!\n",
    "for i,n1 in enumerate(n1sampleSizes):\n",
    "\n",
    "  # calculate the n2 sample size\n",
    "  n2 = totalN-n1\n",
    "\n",
    "  # the ratio\n",
    "  sr = n2/n1\n",
    "\n",
    "  # compute and store power\n",
    "  powers[i] = smp.TTestIndPower().power(effect_size=.6, nobs1=n1, alpha=.05, ratio=sr)\n",
    "\n",
    "\n",
    "# plot the results\n",
    "plt.figure(figsize=(8,3))\n",
    "plt.plot(n1sampleSizes,powers,'ks-',markersize=9,markerfacecolor=(.8,.8,.8))\n",
    "plt.xlabel('Observations in group 1')\n",
    "plt.ylabel('Statistical power')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('power_ex5.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "Ub3s8NjiC8qz"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "Qvm89BGcQItF"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 6"
   ],
   "metadata": {
    "id": "i6uPyzeNqVJe"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# population and sample parameters\n",
    "mu    = .8\n",
    "sigma = 1.8\n",
    "n     = 38\n",
    "\n",
    "# critical t-values (2-tailed)\n",
    "t_critL = stats.t.ppf(.05/2, n-1)\n",
    "t_critR = stats.t.ppf(1-.05/2, n-1)\n",
    "\n",
    "# simulation parameters\n",
    "num_simulations = 10000\n",
    "rejectH0 = 0 # initialize a counter for when H0 was rejected\n",
    "\n",
    "\n",
    "# run the experiment!\n",
    "for _ in range(num_simulations):\n",
    "\n",
    "  # draw a sample from a population with known parameters\n",
    "  sample = np.random.normal(mu,sigma,n)\n",
    "  sample_mean = np.mean(sample)\n",
    "  sample_se = np.std(sample,ddof=1) / np.sqrt(n)\n",
    "\n",
    "  # Calculate the t-statistic\n",
    "  tVal = sample_mean / sample_se\n",
    "\n",
    "  # Check if t-stat falls into the rejection region\n",
    "  if tVal<t_critL or tVal>t_critR:\n",
    "    rejectH0 += 1\n",
    "\n",
    "# Estimate empirical power (percent of simulations where H0 was rejected)\n",
    "powerEm = 100 * rejectH0/num_simulations\n",
    "\n",
    "## compute analytic power from formula\n",
    "effectSize = mu / sigma # using population parameters\n",
    "powerAn = 100 * smp.TTestPower().power(effect_size=effectSize, nobs=n, alpha=.05)\n",
    "\n",
    "# print the results\n",
    "print(f'Theoretical power from formula:   {powerAn:.3f}%')\n",
    "print(f'Empirical power from simulations: {powerEm:.3f}%')"
   ],
   "metadata": {
    "id": "3MXHyUWAI2n1"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "### Note about the code in the previous cell: I wrote out the mechanics of the t-test so you\n",
    "#   could see the link to the formula for computing statistical power. In practice, it's simpler\n",
    "#   to use the ttest function in scipy. The code below produces the same result using less code.\n",
    "\n",
    "\n",
    "# re-initialize the counter!\n",
    "rejectH0 = 0\n",
    "\n",
    "# run the experiment!\n",
    "for _ in range(num_simulations):\n",
    "\n",
    "  # draw a sample from a population with known parameters\n",
    "  sample = np.random.normal(mu,sigma,n)\n",
    "\n",
    "  # up the counter if the t-value is significant\n",
    "  if stats.ttest_1samp(sample,0).pvalue<.05:\n",
    "    rejectH0 += 1\n",
    "\n",
    "# Estimate empirical power (percent of simulations where H0 was rejected)\n",
    "powerEm = 100 * rejectH0/num_simulations\n",
    "\n",
    "## compute analytic power from formula\n",
    "effectSize = mu / sigma # using population parameters\n",
    "powerAn = 100 * smp.TTestPower().power(effect_size=effectSize, nobs=n, alpha=.05)\n",
    "\n",
    "# print the results\n",
    "print(f'Analytical power from formula:    {powerAn:.3f}%')\n",
    "print(f'Empirical power from simulations: {powerEm:.3f}%')"
   ],
   "metadata": {
    "id": "4XODkrZGHt4G"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "lo5A-laaQVpS"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 7"
   ],
   "metadata": {
    "id": "vw0JSGhU4mIq"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# population and sample parameters\n",
    "mu    = .8\n",
    "sigma = 1.8\n",
    "n     = 38\n",
    "\n",
    "# simulation parameters\n",
    "num_simulations = 10000\n",
    "rejectH0 = 0 # initialize a counter for when H0 was rejected\n",
    "\n",
    "# run the experiment!\n",
    "for _ in range(num_simulations):\n",
    "\n",
    "  # draw a sample from a population with known parameters\n",
    "  sample = np.exp( np.random.normal(mu,sigma,n) )\n",
    "\n",
    "  # up the counter if the t-value is significant\n",
    "  if stats.ttest_1samp(sample,0).pvalue<.05:\n",
    "    rejectH0 += 1\n",
    "\n",
    "# Estimate empirical power (percent of simulations where H0 was rejected)\n",
    "powerEm = 100 * rejectH0/num_simulations\n",
    "\n",
    "## compute analytic power from formula\n",
    "popMean = np.exp(mu + sigma**2/2)\n",
    "popStd  = np.exp(mu + sigma**2/2) * np.sqrt(np.exp(sigma**2)-1)\n",
    "effectSize = popMean / popStd # using population parameters\n",
    "powerAn = 100 * smp.TTestPower().power(effect_size=effectSize, nobs=n, alpha=.05)\n",
    "\n",
    "# print the results\n",
    "print(f'Analytical power from formula:    {powerAn:.3f}%')\n",
    "print(f'Empirical power from simulations: {powerEm:.3f}%')"
   ],
   "metadata": {
    "id": "nsvna5AM4lPU"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "0jloQmRS4soe"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 8"
   ],
   "metadata": {
    "id": "mt0AuaAT_q-I"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "## repeat using Wilcoxon test against H0=0\n",
    "\n",
    "# initialize a counter for when H0 was rejected\n",
    "rejectH0 = 0\n",
    "\n",
    "# run the experiment!\n",
    "for _ in range(num_simulations):\n",
    "\n",
    "  # draw a sample from a population with known parameters\n",
    "  sample = np.exp( np.random.normal(mu,sigma,n) )\n",
    "  W = stats.wilcoxon(sample)\n",
    "  if W.pvalue<.05:\n",
    "    rejectH0 += 1\n",
    "\n",
    "# Estimate empirical power (percent of simulations where H0 was rejected)\n",
    "powerEm = 100 * rejectH0/num_simulations\n",
    "\n",
    "# print the results\n",
    "print(f'Analytical power from simulations: {powerEm:.3f}%')"
   ],
   "metadata": {
    "id": "6W8G9E7-4slr"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "plt.plot(sample,'s');"
   ],
   "metadata": {
    "id": "s5seChb1P6LY"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# now using a more reasonable H0 value\n",
    "h0 = 10\n",
    "\n",
    "# don't forget to keep resetting this counter!\n",
    "rejectH0 = 0\n",
    "\n",
    "# run the experiment!\n",
    "for _ in range(num_simulations):\n",
    "\n",
    "  # draw a sample from a population with known parameters\n",
    "  sample = np.exp( np.random.normal(mu,sigma,n) )\n",
    "  W = stats.wilcoxon(sample-h0)\n",
    "  if W.pvalue<.05:\n",
    "    rejectH0 += 1\n",
    "\n",
    "# Estimate empirical power (percent of simulations where H0 was rejected)\n",
    "powerEm = 100 * rejectH0/num_simulations\n",
    "\n",
    "# print the results\n",
    "print(f'Empirical power from simulations: {powerEm:.3f}%')"
   ],
   "metadata": {
    "id": "kGpJqahN4si6"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "Kn-PNQRb4sf-"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 9"
   ],
   "metadata": {
    "id": "SHtJA3YbQ5lI"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# import and process data (take from Exercise 11.12)\n",
    "\n",
    "url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv\"\n",
    "data = pd.read_csv(url,sep=';')\n",
    "\n",
    "# which columns to t-test\n",
    "cols2test = data.keys()\n",
    "cols2test = cols2test.drop('quality')\n",
    "\n",
    "# create a new column for binarized (boolean) quality\n",
    "data['boolQuality'] = False\n",
    "data['boolQuality'][data['quality']>5] = True"
   ],
   "metadata": {
    "id": "8IUS432VQ4uV"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# statistical threshold (Bonferroni-corrected)\n",
    "bonP = .05/len(cols2test)\n",
    "\n",
    "\n",
    "# loop over column\n",
    "for col in cols2test:\n",
    "\n",
    "  # for convenience, extract the numerical variables\n",
    "  Xh = data[col][data['boolQuality']==True].values  # high rating\n",
    "  Xl = data[col][data['boolQuality']==False].values # low rating\n",
    "\n",
    "  # sample size and ratio\n",
    "  nh = len(Xh)\n",
    "  nl = len(Xl)\n",
    "  sr = nh/nl\n",
    "\n",
    "  # effect size (es)\n",
    "  es_num = np.mean(Xh)-np.mean(Xl)\n",
    "  es_den = np.sqrt( ( (nh-1)*np.var(Xh,ddof=1)+(nl-1)*np.var(Xl,ddof=1) )/(nh+nl-2) )\n",
    "\n",
    "  # compute power\n",
    "  power = smp.TTestIndPower().power(effect_size=es_num/es_den, nobs1=nh, alpha=bonP, ratio=sr)\n",
    "\n",
    "  # run the t-test\n",
    "  tres = stats.ttest_ind(Xh,Xl,equal_var=False)\n",
    "\n",
    "  # print the results\n",
    "  print(f'{col:>20}: t({tres.df:.0f})={tres.statistic:6.2f}, p={tres.pvalue:.3f}, power={power:.3f}')"
   ],
   "metadata": {
    "id": "g-d9YSer4sdQ"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "At1AkI6E4sac"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 10"
   ],
   "metadata": {
    "id": "_2hsfPjBWZYe"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# the examples I showed in the book\n",
    "smp.TTestIndPower().solve_power(effect_size=1,alpha=.05,power=.8,nobs1=50,ratio=None)\n",
    "smp.TTestIndPower().solve_power(effect_size=1,alpha=.05,power=.8,nobs1=None,ratio=1)"
   ],
   "metadata": {
    "id": "R3KMXZHVj8pW"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# statistical threshold (Bonferroni-corrected)\n",
    "bonP = .05/len(cols2test)\n",
    "\n",
    "\n",
    "# loop over column\n",
    "for col in cols2test:\n",
    "\n",
    "  # for convenience, extract the numerical variables\n",
    "  Xh = data[col][data['boolQuality']==True].values  # high rating\n",
    "  Xl = data[col][data['boolQuality']==False].values # low rating\n",
    "\n",
    "  # sample size and ratio\n",
    "  nh = len(Xh)\n",
    "  nl = len(Xl)\n",
    "  sr = nh/nl\n",
    "\n",
    "  # effect size (es)\n",
    "  es_num = np.mean(Xh)-np.mean(Xl)\n",
    "  es_den = np.sqrt( ( (nh-1)*np.var(Xh,ddof=1)+(nl-1)*np.var(Xl,ddof=1) )/(nh+nl-2) )\n",
    "\n",
    "  # compute power\n",
    "  try:\n",
    "    nl_sr = smp.TTestIndPower().solve_power(\n",
    "        effect_size=es_num/es_den, alpha=bonP, power=.8, nobs1=nh, ratio=None)\n",
    "\n",
    "    # print the results\n",
    "    print(f'{col:>20}: N-high: {nh}, N-low: {int(nh*nl_sr):>3}')\n",
    "\n",
    "  except:\n",
    "    print(f'{col:>20}: ** Does not compute! **')"
   ],
   "metadata": {
    "id": "NllC2xIE4sXu"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "nl_sr"
   ],
   "metadata": {
    "id": "11h3lLZEpsIc"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 11"
   ],
   "metadata": {
    "id": "nnyVkgQ9psgy"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# statistical threshold (Bonferroni-corrected)\n",
    "bonP = .05/len(cols2test)\n",
    "\n",
    "\n",
    "# loop over column\n",
    "for col in cols2test:\n",
    "\n",
    "  # for convenience, extract the numerical variables\n",
    "  Xh = data[col][data['boolQuality']==True].values  # high rating\n",
    "  Xl = data[col][data['boolQuality']==False].values # low rating\n",
    "\n",
    "  # sample size and ratio\n",
    "  nh = len(Xh)\n",
    "  nl = len(Xl)\n",
    "  sr = nh/nl\n",
    "\n",
    "  # effect size (es)\n",
    "  es_num = np.mean(Xh)-np.mean(Xl)\n",
    "  es_den = np.sqrt( ( (nh-1)*np.var(Xh,ddof=1)+(nl-1)*np.var(Xl,ddof=1) )/(nh+nl-2) )\n",
    "\n",
    "  # compute power\n",
    "  try:\n",
    "    nh_r = smp.TTestIndPower().solve_power(\n",
    "        effect_size=es_num/es_den, alpha=bonP, power=.8, nobs1=None, ratio=sr)\n",
    "\n",
    "    # print the results\n",
    "    print(f'{col:>20}: N-high: {int(nh_r):>7}, N-low: {int(nh_r*sr)}')\n",
    "\n",
    "  except:\n",
    "    print(f'{col:>20}: ** Does not compute! **')"
   ],
   "metadata": {
    "id": "1YO_zITZWke5"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "WsVp_8TdWkSX"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}